/******************************************************************************
This file cleans the enrollment file for the analysis based on the demographic
files.
--------------------------------------------------------------------------------
Input:			Raw yearly data sets containing an overview of students' enrollment
					> {yyyy}-{yy}_June-Biog_PK-12_Scrambled.dta
--------------------------------------------------------------------------------
Output:			Cleaned demographic file
					> demo_all_years_all_grade.dta
				Demographics by grade
					> demo_info_{g}th_grade.dta
********************************************************************************/

* Switch: 1 = rebuild the stacked all-years file from the yearly raw files,
*         0 = skip the rebuild (the code after this block reloads the saved file)
local append 1

clear all

* Append years
if `append' == 1 {

	* Start from an empty tempfile so that the first year has something to
	* append to (avoids carrying a dummy seed observation through the data)
	tempfile demos
	save `demos', emptyok

	* Loop over school years: `yr' is the fall year, `next_yr' the spring year
	forval yr = 2005/2019 {

		local next_yr = `yr' + 1

		* Two-digit spring year used in the raw file name (e.g. 2005 -> "06")
		local digits_plus = string(mod(`next_yr', 100), "%02.0f")

		* Load raw data
		use "${cleandata}demos/JuneBiog/`yr'-`digits_plus'_June-Biog_PK-12_Scrambled.dta", clear

		* Gen female info
		gen byte female = sex == "F"

		* Generate racial info (mutually exclusive 0/1 indicators from ethnicity)
		gen byte asian 			= ethnicity == "Asian"
		gen byte black 			= ethnicity == "Black"
		gen byte hispanic 		= ethnicity == "Hispanic"
		gen byte nat_american	= ethnicity == "Native American"
		gen byte white	 		= ethnicity == "White"
		gen byte other 			= ethnicity == "."

		* Generate free and reduced price lunch info
		* (real() returns missing when poverty is not a numeric string)
		gen byte fr_lunch = real(poverty)

		* Gen spring year info
		gen year = `next_yr'

		* Stack the years accumulated so far below the current year
		append using `demos'

		save `demos', replace
	}

	* Harmonize variable names
	* (done before the names are used below, so nothing depends on Stata
	*  resolving "stu" as an abbreviation of student_id)
	ren (student_id grade_level dbn) (stu grade sch_long)

	* Drop if ID or grade is missing
	* (these rows form their own stu-grade groups, so dropping them first does
	*  not change the indicators computed for the remaining rows)
	drop if stu == "" | grade == ""

	* Indicate on-time progression (did not repeat grade in the next year).
	* Every row of a student in a grade counts as one attempt, ordered by year:
	*   ontime              = 1 on the student's last recorded attempt at the grade
	*   ever_repeated_grade = 1 if the student has more than one row in the grade
	bys stu grade (year): gen grade_attempt = _n
	egen max_attempts = max(grade_attempt), by(stu grade)
	g ontime = grade_attempt == max_attempts
	g ever_repeated_grade = max_attempts > 1
	drop grade_attempt max_attempts

	* Create school code variable (4 characters of sch_long starting at position 3)
	gen sch = substr(sch_long, 3, 4)

	//isid  stu grade

	* Save one file with all years and all grade attempts
	* (the restriction to first attempts is applied after the file is reloaded)
	sa "${cleandata}/demo_all_years_all_grade.dta", replace
}

* Reload the stacked file (needed when the append step above is switched off)
use "${cleandata}/demo_all_years_all_grade.dta", clear

* Keep only the earliest year in which each student appears in a given grade
tempvar first_year
bys stu grade: egen `first_year' = min(year)
keep if year == `first_year'
drop `first_year'

* Borough letter = third character of the long school code
gen enr_borough = substr(sch_long, 3, 1)

* Variables that are carried into the baseline file with a bl_ prefix
local bl_vars asian black hispanic nat_american white other female swd ell fr_lunch ///
	days_abs days_pres days_released ontime ever_repeated_grade

* Borough indicator suffixes and the matching one-letter codes (same order)
local boro_names queens manhattan brooklyn staten bronx
local boro_codes Q M K R X
local n_boro : word count `boro_names'

* One baseline file per candidate baseline grade (currently only grade 08)
foreach gr in 08 {
	preserve
		* Restrict to records of this grade
		keep if grade == "`gr'"

		* Borough letter of the school attended in this grade
		gen enr_`gr'_borough = substr(sch_long, 3, 1)

		* Keep identifiers, year, baseline covariates and the borough letter
		keep stu sch_long year `bl_vars' enr_`gr'_borough

		* Prefix baseline covariates with bl_
		foreach var of local bl_vars {
			ren `var' bl_`var'
		}

		* One 0/1 indicator per borough, built from the one-letter code
		forvalues i = 1/`n_boro' {
			local boro_name : word `i' of `boro_names'
			local boro_code : word `i' of `boro_codes'
			gen enr_`gr'_`boro_name' = enr_`gr'_borough == "`boro_code'"
		}

		//isid stu
		* Write the grade-specific baseline file
		sa "${cleandata}demo_info_`gr'th_grade.dta", replace

	restore

}
